home *** CD-ROM | disk | FTP | other *** search
- (********************************************************************)
- (* Collate.PAS *)
- (* Collation table class *)
- (* *)
- (* (c) Julian M Bucknall, 1997 *)
- (********************************************************************)
-
- { Notes:
- There are two classes in this unit: TSortString and TCollation. The
- former class is an internal class used to store the sort values for
- a string, which was converted with a TCollation class instance. The
- TCollation class is the interesting one: it defines a complete
- collation table.
-
- Some documentation for TCollation:
- constructor Create;
- - creates an instance of a collation table. The table, by default,
- is a simple binary collation (ie, it works in a similar manner
- to the Delphi < string operator).
-
- destructor Destroy; override;
- - destroys an instance of a collation table
-
- procedure LoadFromStream(Stream : TStream);
- - loads the collation table data froma stream.
-
- procedure LoadFromFile(FileName : string);
- - loads the collation table data from a collation file. These \
- files have extension CLL.
-
- function CompareStrings(const S1, S2 : string) : integer;
- - compares two strings according to the collation table. The
- result is <0 if S1 < S2, 0 if they compare equal, >0 otherwise.
-
- function CompareSortStrings(const SS1, SS2 : TSortString) : integer;
- - compares two sort strings previously converted from strings
- using ConvertText. The result is <0 if S1 < S2, 0 if they
- compare equal, >0 otherwise.
-
- function ConvertText(const TextStr : string) : TSortString;
- - converts a string into its sort string equivalent. The method
- creates a new instance of a TSortString, and you are responsible
- for destroying it by calling its destructor via Destroy or Free
- once you no longer need it.
-
- property Description : string
- - a read-only property that is the description of the collation
- table. This is retrieved from the collation file.
-
- property IsBinary : boolean
- - a read-only property that is true if the collation table is a
- simple binary collation, or false if it is not.
-
- Limitations:
- - at present the only way to provide data for a collation table is
- via the LoadFromFile method.
-
- Collation files:
- The collation file layout is a stream-based one. Conceptually it
- starts with the following format:
-
- <description string>
- ^Z character
- <boolean for whether the collation is binary or not>
-
- If the collation table is binary the file ends there. For a non-
- binary collation, it continues:
-
- <count of ligatures as a byte>
- <array of TLigature records (see below)>
- <count of double characters as a byte>
- <array of TDoubleChar records (see below)>
- <256 bytes for TSortValues instance>
-
- Obviously if one of the two counts is zero, the corresponding array
- does not exist; the the number of elements in the array in the
- stream equals the corresponding count.
- }
-
- unit Collate;
-
- interface
-
- uses
- Classes;
-
- type
- PSortData = ^TSortData;
- TSortData = packed record {Internal representation of a sort string}
- sdSize : longint;
- sdLen : longint;
- sdVals : array [0..1023] of byte;
- end;
-
- TSortString = class {A sort string class}
- protected {private}
- ssData : PSortData;
- protected
- function GetLength : longint;
- function GetSize : longint;
- function GetValuePtr : pointer;
- procedure SetLength(L : longint);
- public
- constructor Create(aSize : longint);
- destructor Destroy; override;
- function Compare(aSS : TSortString) : integer;
- {Compare with another sort string; return <0 if self is less
- than aSS, 0 if equal, >0 otherwise}
- procedure Grow(aSize : longint);
- {Grow the sort string to accomodate aSize values}
- procedure Minimize;
- {Minimize the sort string so that its size matches its length}
- property Size : longint
- read GetSize;
- {Size of the sort string, ie max num of values}
- property Length : longint
- read GetLength write SetLength;
- {Length of the sort string, < Size}
- property ValuePtr : pointer
- read GetValuePtr;
- {Pointer to array of sort values}
- end;
-
- type
- TLigature = packed record
- lLig : AnsiChar; {ligature character}
- lSVal1 : byte; {sort value of first character}
- lSVal2 : byte; {sort value of second character}
- end;
- PLigatureArray = ^TLigatureArray;
- TLigatureArray = array [0..255] of TLigature;
-
- TDoubleChar = packed record
- dcChar1 : AnsiChar; {first character}
- dcChar2 : AnsiChar; {second character}
- dcSortValue : byte; {sort value of double character}
- end;
- PDoubleCharArray = ^TDoubleCharArray;
- TDoubleCharArray = array [0..255] of TDoubleChar;
-
- TSortValues = array [0..255] of byte;
-
- TCollation = class
- protected {private}
- FBinary : boolean;
- FDesc : string;
- FLigCount : integer;
- FLigArray : PLigatureArray;
- FDCACount : integer;
- FDblArray : PDoubleCharArray;
- FSortVals : TSortValues;
-
- SS1 : TSortString;
- SS2 : TSortString;
- protected
- procedure clConvertText(const S : string;
- const SS : TSortString);
- public
- constructor Create;
- destructor Destroy; override;
-
- procedure LoadFromStream(Stream : TStream);
- procedure LoadFromFile(FileName : string);
-
- function CompareStrings(const S1, S2 : string) : integer;
- function CompareSortStrings(const SS1, SS2 : TSortString) : integer;
- function ConvertText(const TextStr : string) : TSortString;
-
- property Description : string
- read FDesc;
- property IsBinary : boolean
- read FBinary;
- end;
-
- implementation
-
- uses
- SysUtils;
-
- {===Sort String Routines=============================================}
- constructor TSortString.Create(aSize : longint);
- begin
- {inherited Create;}
- GetMem(ssData, (2 * sizeof(longint)) + aSize);
- ssData^.sdSize := aSize;
- ssData^.sdLen := 0;
- end;
- {--------}
- destructor TSortString.Destroy;
- begin
- if (ssData <> nil) then
- FreeMem(ssData, (2 * sizeof(longint)) + ssData^.sdSize);
- {inherited Destroy;}
- end;
- {--------}
- function TSortString.Compare(aSS : TSortString) : integer;
- asm
- {EAX = Self}
- {EDX = aSS}
- push ebx
- push esi
- push edi
- push ebp
-
- mov esi, [eax].TSortString.ssData
- mov ebp, [esi+4] {ebp = length self sort data}
- add esi, 8 {esi => self sort data}
- mov edi, [edx].TSortString.ssData
- mov edx, [edi+4] {edx = length aSS sort data}
- add edi, 8 {edi => aSS sort data}
-
- xor eax, eax {assume equal}
-
- mov ecx, ebp {calculate the smaller length}
- cmp ecx, edx
- jb @@GotMinLength
- mov ecx, edx
-
- @@GotMinLength:
- push ecx {save shorter length}
- shr ecx, 2 {divide by 4}
- jz @@DoRemainder
- @@EightByteLoop: {compare bytes 8 at a time}
- mov ebx, [esi]
- cmp ebx, [edi]
- jne @@RecompareBytes
- dec ecx
- jz @@DoRemainderPlus4
- mov ebx, [esi+4]
- cmp ebx, [edi+4]
- jne @@RecompareBytesPlus4
- add esi, 8
- add edi, 8
- dec ecx
- jnz @@EightByteLoop
- jmp @@DoRemainder
-
- @@RecompareBytesPlus4: {mismatch - recompare bytes}
- add esi, 4
- add edi, 4
- @@RecompareBytes:
- pop ecx {get shorter length & discard}
- mov ecx, 4
- jmp @@CompareBytes
-
- @@DoRemainderPlus4: {do remaining bytes}
- add esi, 4
- add edi, 4
- @@DoRemainder:
- pop ecx {get shorter length}
- and ecx, 3 {..mod 4}
- jz @@EQ
-
- @@CompareBytes: {compare up to 4 bytes}
- mov bl, [esi] {first byte}
- cmp bl, [edi]
- jb @@LT
- ja @@GT
- dec ecx
- jz @@EQ
- mov bl, [esi+1] {second byte}
- cmp bl, [edi+1]
- jb @@LT
- ja @@GT
- dec ecx
- jz @@EQ
- mov bl, [esi+2] {third byte}
- cmp bl, [edi+2]
- jb @@LT
- ja @@GT
- dec ecx
- jz @@EQ
- mov bl, [esi+3] {fourth byte, only on mismatch}
- cmp bl, [edi+3]
- jb @@LT
- ja @@GT
-
- @@EQ: {bytes are equal}
- cmp ebp, edx {compare old lengths}
- jb @@LT
- ja @@GT
- inc eax
-
- @@LT:
- dec eax
- dec eax
-
- @@GT:
- inc eax
-
- @@Exit:
- pop ebp
- pop edi
- pop esi
- pop ebx
- end;
- {--------}
- function TSortString.GetLength : longint;
- begin
- Result := ssData^.sdLen;
- end;
- {--------}
- function TSortString.GetSize : longint;
- begin
- Result := ssData^.sdSize;
- end;
- {--------}
- function TSortString.GetValuePtr : pointer;
- begin
- Result := @ssData^.sdVals;
- end;
- {--------}
- procedure TSortString.Grow(aSize : longint);
- begin
- if (ssData <> nil) and (aSize > ssData^.sdSize) then begin
- ReallocMem(ssData, (2 * sizeof(longint)) + aSize);
- ssData^.sdSize := aSize;
- end;
- end;
- {--------}
- procedure TSortString.Minimize;
- begin
- if (ssData <> nil) and (ssData^.sdLen < ssData^.sdSize) then begin
- ReallocMem(ssData, (2 * sizeof(longint)) + ssData^.sdLen);
- ssData^.sdSize := ssData^.sdLen;
- end;
- end;
- {--------}
- procedure TSortString.SetLength(L : longint);
- begin
- ssData^.sdLen := L;
- end;
- {====================================================================}
-
- {===TCollation=======================================================}
- constructor TCollation.Create;
- begin
- inherited Create;
- {preallocate two buffers for converted strings, 512 bytes should be
- ample for the vast majority of cases}
- SS1 := TSortString.Create(512);
- SS2 := TSortString.Create(512);
- {initialize the sortvalues array}
- FillChar(FSortVals, sizeof(FSortVals), 0);
- {the collation is binary to begin with}
- FBinary := true;
- end;
- {--------}
- destructor TCollation.Destroy;
- begin
- {destroy our buffers}
- SS1.Free;
- SS2.Free;
- {destroy our ligature and doublechar arrays}
- if (FLigCount <> 0) and (FLigArray <> nil) then
- FreeMem(FLigArray, FLigCount * sizeof(TLigature));
- if (FDCACount <> 0) and (FDblArray <> nil) then
- FreeMem(FDblArray, FDCACount * sizeof(TDoubleChar));
- {continue the destroy}
- inherited Destroy;
- end;
- {--------}
- procedure TCollation.clConvertText(const S : string;
- const SS : TSortString);
- var
- ChInx : integer;
- i : integer;
- ConvCount : integer;
- ConvVals : PByteArray;
- Ch : AnsiChar;
- UsedLigature : boolean;
- UsedDouble : boolean;
- CheckDoubles : boolean;
- begin
- {set up some variables ready for the loop}
- ConvCount := 0;
- ConvVals := PByteArray(SS.ValuePtr);
- UsedLigature := false;
- UsedDouble := false;
- CheckDoubles := FDCACount <> 0;
- {convert each character into its sort value equivalent}
- for ChInx := 1 to length(S) do begin
- {save time: get the character into a local variable}
- Ch := S[ChInx];
- {check our ligatures}
- for i := 0 to pred(FLigCount) do begin
- if (Ch = FLigArray[i].lLig) then begin
- UsedLigature := true;
- ConvVals[ConvCount] := FLigArray[i].lSVal1;
- ConvVals[ConvCount+1] := FLigArray[i].lSVal2;
- inc(ConvCount, 2);
- Break;
- end;
- end;
- if UsedLigature then begin
- UsedLigature := false;
- end
- else {a ligature was not found} begin
- {check our double characters, if required}
- if CheckDoubles then begin
- if (ChInx < length(S)) then begin
- for i := 0 to pred(FDCACount) do begin
- if (Ch = FDblArray[i].dcChar1) and
- (S[ChInx+1] = FDblArray[i].dcChar2) then begin
- UsedDouble := true;
- ConvVals[ConvCount] := FDblArray[i].dcSortValue;
- inc(ConvCount);
- Break;
- end;
- end;
- end;
- if UsedDouble then begin
- UsedDouble := false;
- end
- else {a double char was not found} begin
- ConvVals[ConvCount] := FSortVals[ord(Ch)];
- inc(ConvCount);
- end;
- end
- else {no double chars to check for} begin
- ConvVals[ConvCount] := FSortVals[ord(Ch)];
- inc(ConvCount);
- end;
- end;
- end;
- {set the length of the sort string}
- SS.Length := ConvCount;
- end;
- {--------}
- function TCollation.CompareSortStrings(const SS1, SS2 : TSortString) : integer;
- begin
- Result := SS1.Compare(SS2);
- end;
- {--------}
- function TCollation.CompareStrings(const S1, S2 : string) : integer;
- begin
- if IsBinary then
- {if it's a binary collation, the comparison is a simple binary
- comparison}
- Result := SysUtils.CompareStr(S1, S2)
- else begin
- {otherwise there's some conversion to do; convert the strings}
- if (SS1.Size < (length(S1) * 2)) then
- SS1.Grow(length(S1) * 2);
- clConvertText(S1, SS1);
- if (SS2.Size < (length(S2) * 2)) then
- SS2.Grow(length(S1) * 2);
- clConvertText(S2, SS2);
- {compare the two sort strings}
- Result := SS1.Compare(SS2);
- end;
- end;
- {--------}
- function TCollation.ConvertText(const TextStr : string) : TSortString;
- var
- TextLen : integer;
- begin
- if IsBinary then begin
- {if it's a binary collation, the converted string is the same as
- the original}
- TextLen := length(TextStr);
- Result := TSortString.Create(TextLen);
- Move(TextStr[1], Result.ValuePtr^, TextLen);
- Result.Length := TextLen;
- end
- else begin
- {otherwise there's some conversion to do; preallocate the
- converted string result - we assume every character is a
- ligature}
- Result := TSortString.Create(length(TextStr) * 2);
- {convert the string}
- clConvertText(TextStr, Result);
- {tidy up by setting the correct length of the result}
- Result.Minimize;
- end;
- end;
- {--------}
- procedure TCollation.LoadFromFile(FileName : string);
- var
- S : TStream;
- begin
- S := TFileStream.Create(FileName, fmOpenRead + fmShareDenyWrite);
- try
- LoadFromStream(S);
- finally
- S.Free;
- end;{try..finally}
- end;
- {--------}
- procedure TCollation.LoadFromStream(Stream : TStream);
- var
- i : integer;
- ActualLen : integer;
- NumBytes : integer;
- StrmPos : longint;
- begin
- with Stream do begin
- SetLength(FDesc, 256);
- StrmPos := Position;
- Read(FDesc[1], 256);
- ActualLen := 0;
- for i := 1 to 256 do
- if (FDesc[i] = ^Z) then begin
- ActualLen := i-1;
- Break;
- end;
- SetLength(FDesc, ActualLen);
- Position := StrmPos + ActualLen + 1;
- Read(FBinary, sizeof(FBinary));
- if not IsBinary then begin
- FLigCount := 0;
- Read(FLigCount, sizeof(byte));
- if (FLigCount <> 0) then begin
- NumBytes := FLigCount * sizeof(TLigature);
- GetMem(FLigArray, NumBytes);
- Read(FLigArray^, NumBytes);
- end;
- FDCACount := 0;
- Read(FDCACount, sizeof(byte));
- if (FDCACount <> 0) then begin
- NumBytes := FDCACount * sizeof(TDoubleChar);
- GetMem(FDblArray, NumBytes);
- Read(FDblArray^, NumBytes);
- end;
- Read(FSortVals, sizeof(FSortVals));
- end;
- end;
- end;
- {====================================================================}
-
- end.
-